Due to the gdal package, a separate environment had to be created using conda, so install the packages for this notebook in that environment itself. From the Anaconda prompt, check the names of all the available environments:
$ conda info --envs
$ activate env_name
In [1]:
from osgeo import ogr, osr, gdal
import fiona
from shapely.geometry import Point, shape
import numpy as np
import pandas as pd
import os
import sys
import tarfile
import timeit
base_ = "C:\Users\deepak\Desktop\Repo\Maps\Districts\Census_2011"
base_ = "/Users/macbook/Documents/BTP/Satellite/Data/Maps/Districts/Census_2011"
In [2]:
# Change this for Win7 / macOS
bases = r"C:\Users\deepak\Desktop\Repo\Maps\Districts\Census\Dist.shp"
# base_ = "/Users/macbook/Documents/BTP/Satellite/Data/Maps/Districts/Census_2011"
fc = fiona.open(bases)
In [3]:
def reverse_geocode(pt):
    # return the district whose polygon contains the point, or "NRI" if none does
    for feature in fc:
        if shape(feature['geometry']).contains(pt):
            return feature['properties']['DISTRICT']
    return "NRI"
In [4]:
# base = "/Users/macbook/Documents/BTP/Satellite/Data/Sat" # macOS
base = "G:\BTP\Satellite\Data\Test" # Win7
In [6]:
def extract(filename, force=False):
root = os.path.splitext(os.path.splitext(filename)[0])[0] # remove .tar.gz
if os.path.isdir(os.path.join(base,root)) and not force:
# You may override by setting force=True.
print('%s already present - Skipping extraction of %s' % (root, filename))
else:
print('Extracting data for %s' % root)
tar = tarfile.open(os.path.join(base,filename))
sys.stdout.flush()
tar.extractall(os.path.join(base,root))
tar.close()
In [7]:
# extracting all the tar files ... (if not extracted)
for filename in sorted(os.listdir(base)):  # the tarballs sit directly in base
    if filename.endswith(".tar.gz"):
        extract(filename)
In [8]:
directories = [os.path.join(base, d) for d in sorted(os.listdir(base)) if os.path.isdir(os.path.join(base, d))]
# print directories
In [9]:
ds = gdal.Open(base + "\LE07_L1TP_146039_20101223_20161211_01_T1\LE07_L1TP_146039_20101223_20161211_01_T1_B1.TIF")
Prepare one ds variable here itself, for setting up the coordinate-system transformation below.
In [25]:
# get the existing coordinate system
old_cs= osr.SpatialReference()
old_cs.ImportFromWkt(ds.GetProjectionRef())
# create the new coordinate system
wgs84_wkt = """
GEOGCS["WGS 84",
DATUM["WGS_1984",
SPHEROID["WGS 84",6378137,298.257223563,
AUTHORITY["EPSG","7030"]],
AUTHORITY["EPSG","6326"]],
PRIMEM["Greenwich",0,
AUTHORITY["EPSG","8901"]],
UNIT["degree",0.01745329251994328,
AUTHORITY["EPSG","9122"]],
AUTHORITY["EPSG","4326"]]"""
new_cs = osr.SpatialReference()
new_cs.ImportFromWkt(wgs84_wkt)
# create a transform object to convert between coordinate systems
transform = osr.CoordinateTransformation(old_cs,new_cs)
In [26]:
type(ds)
Out[26]:
In [11]:
def pixel2coord(x, y, xoff, a, b, yoff, d, e):
    """Returns global (projected) coordinates for pixel coordinates (x, y).
    The affine coefficients (xoff, a, b, yoff, d, e) come from ds.GetGeoTransform()."""
    xp = a * x + b * y + xoff
    yp = d * x + e * y + yoff
    return (xp, yp)
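Putting the pieces together, a minimal sketch (using the ds and transform objects prepared above) that maps the scene's top-left pixel to geographic coordinates:
xoff, a, b, yoff, d, e = ds.GetGeoTransform()
x, y = pixel2coord(0, 0, xoff, a, b, yoff, d, e)  # projected (UTM) coordinates
lonx, latx, z = transform.TransformPoint(x, y)    # WGS 84 lon/lat, as unpacked in the big loop below
print(lonx, latx)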
In [12]:
ricep = pd.read_csv(r"C:\Users\deepak\Desktop\Repo\BTP\Ricep.csv")
ricep = ricep.drop(["Unnamed: 0"],axis=1)
ricep["value"] = ricep["Production"]/ricep["Area"]
ricep.head()
Out[12]:
In [13]:
a = np.empty((ricep.shape[0],1))*np.nan
The value of n is going to be the same for all 10 band files of a month, hence the same for all 20 features at a time. The number of 480 extra columns here can be reduced using this.
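A minimal sketch of that reduction, assuming a hypothetical layout with one shared count column per month (the "_N" names are illustrative, not the ones used below):
# Hypothetical layout: one shared pixel-count column per month instead of
# one count column per (month, band) combination
features_reduced = []
for i in range(1, 13):
    for j in range(1, 11):
        s = str(i) + "_B" + str(j) + "_"
        features_reduced.append(s + "M")    # running mean
        features_reduced.append(s + "V")    # running variance
    features_reduced.append(str(i) + "_N")  # single shared count for month i
print(len(features_reduced))  # 240 + 12 = 252 instead of 480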
In [14]:
""" 'features' contain collumn indexes for the new features """
""" 'dictn' is the dictionary mapping name of collumn index to the index number """
features = []
dictn = {}
k = 13
for i in range(1,13):
for j in range(1,11):
s = str(i) + "_B" + str(j) + "_"
features.append(s+"M")
features.append(s+"V")
dictn[s+"M"] = k
dictn[s+"V"] = k+1
k = k+2
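A quick check of the generated names and the index mapping (the expected values follow from k starting at 13):
print(features[:4])                      # ['1_B1_M', '1_B1_V', '1_B2_M', '1_B2_V']
print(dictn['1_B1_M'], dictn['1_B1_V'])  # 13 14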
In [15]:
for i in range(1,13):
for j in range(1,11):
s = str(i) + "_B" + str(j) + "_"
features.append(s+"Mn")
features.append(s+"Vn")
In [16]:
len(features)
Out[16]:
In [17]:
tmp = pd.DataFrame(index=range(ricep.shape[0]),columns=features)
ricex = pd.concat([ricep,tmp], axis=1)
In [18]:
ricex.head()
Out[18]:
In [19]:
k = 10
hits = 0
times = [0,0,0,0,0,0,0,0]
nums = [0,0,0,0,0,0,0,0]
bx = False
In [20]:
stx = timeit.default_timer()
for directory in directories:
    if bx: continue        # bx limits this profiling run to a single directory
    else: bx = True
    dictx = {}
    """ Identifying Month, Year, Spacecraft ID """
    date = directory.split('\\')[-1].split('_')[3] # Change for Win7
    satx = directory.split('\\')[-1][3]
    month = date[4:6]
    year = date[0:4]
    """ Visiting every GeoTIFF file """
    for _, _, files in os.walk(directory):
        for filename in files:
            # make sure we do not go into the extra folders
            if filename.endswith(".TIF"):
                if filename[-5] == '8': continue   # skip Band 8
                #--------------------------------------------------------------------------------------
                # Check, for a single iteration, which step takes the longest time and improve that step.
                # Do the full work for B1.TIF only; for the other bands, reuse the row indexes found in
                # B1's iteration so we don't have to search the dataframe again and again. But keep track
                # of the pixels which were not found, so we can skip them too and the correct pixel
                # value goes to the correct row in the dataframe. We still have to traverse the TIFF
                # file to get the pixel values. Steps common to all 10 TIFF files:
                #   - the district search from the pixel's lat,lon coordinates
                #   - the row-index search using district and year
                #   - the same n is read and written for all the ...
                # If nothing works, maybe we can reduce the number of features by visiting only the
                # first 5 TIF files of each scene.
                #--------------------------------------------------------------------------------------
                print(os.path.join(directory, filename))
                ds = gdal.Open(os.path.join(directory, filename))
                if ds is None: continue
                col, row, _ = ds.RasterXSize, ds.RasterYSize, ds.RasterCount
                xoff, a, b, yoff, d, e = ds.GetGeoTransform()
                """ Now go to each pixel, find its lat,lon, hence its district and the pixel value. """
                """ Find the row with the same (Year, District) in the crop dataset. """
                """ Find the feature using Month, Band, SATx. """
                """ For this we have to maintain a running Mean & Variance. """
                for i in range(0, col, col//k):
                    for j in range(0, row, row//k):
                        st = timeit.default_timer()
                        ########### fetching the lat and lon coordinates
                        x, y = pixel2coord(i, j, xoff, a, b, yoff, d, e)
                        lonx, latx, z = transform.TransformPoint(x, y)
                        times[0] += timeit.default_timer() - st
                        nums[0] += 1
                        st = timeit.default_timer()
                        ########### fetching the name of the district
                        district = ""
                        #----------------------------------------------------------
                        if filename[-5] == '1':
                            point = Point(lonx, latx)
                            district = reverse_geocode(point)
                            dictx[str(lonx)+str(latx)] = district
                        else:
                            district = dictx[str(lonx)+str(latx)]
                        #----------------------------------------------------------
                        times[1] += timeit.default_timer() - st
                        nums[1] += 1
                        if district == "NRI": continue
                        st = timeit.default_timer()
                        ########### locating the row in the DataFrame which we want to update
                        district = district.lower()
                        district = district.strip()
                        r = ricex.index[(ricex['ind_district'] == district) & (ricex['Crop_Year'] == int(year))].tolist()
                        times[3] += timeit.default_timer() - st
                        nums[3] += 1
                        if len(r) == 1:
                            st = timeit.default_timer()
                            ########### the pixel value for that location
                            px, py = i, j
                            pix = ds.ReadAsArray(px, py, 1, 1)
                            # cast to a Python int: the raw value is a narrow numpy integer
                            # type and pix*pix below would otherwise overflow
                            pix = int(pix[0][0])
                            times[2] += timeit.default_timer() - st
                            nums[2] += 1
                            st = timeit.default_timer()
                            """ Found the row, so now ... """
                            """ find the column index corresponding to Month, Band """
                            hits = hits + 1
                            #print ("Hits: ", hits)
                            ####### Band Number ########
                            # e.g. "..._B1.TIF" -> '1'; B6 on Landsat 7 comes in two VCID files; BQA -> '10'
                            band = filename.split("\\")[-1].split("_")[7:][0].split(".")[0][1]
                            bnd = band
                            if band == '6':
                                if filename.split("\\")[-1].split("_")[7:][2][0] == '1':
                                    bnd = band
                                else:
                                    bnd = '9'
                            elif band == 'Q':
                                bnd = '10'
                            sm = month + "_B" + bnd + "_M"
                            cm = dictn[sm]
                            r = r[0]
                            # cm is the column index for the mean
                            # r is the row index
                            times[4] += timeit.default_timer() - st
                            nums[4] += 1
                            ##### checking if the values are null ...
                            valm = ricex.iloc[r,cm]
                            if pd.isnull(valm):
                                st = timeit.default_timer()
                                ricex.iloc[r,cm] = pix
                                ricex.iloc[r,cm+1] = pix*pix
                                ricex.iloc[r,cm+240] = 1
                                times[5] += timeit.default_timer() - st
                                nums[5] += 1
                                continue
                            st = timeit.default_timer()
                            ##### if the values are not null ...
                            valv = ricex.iloc[r,cm+1]
                            # float so that (n-2)/(n-1) below is not integer division
                            n = float(ricex.iloc[r,cm+240])
                            n = n+1
                            times[6] += timeit.default_timer() - st
                            nums[6] += 1
                            st = timeit.default_timer()
                            # running Mean & Variance update:
                            #   mean_n = mean_{n-1} + (x - mean_{n-1})/n
                            #   var_n  = ((n-2)/(n-1))*var_{n-1} + (x - mean_{n-1})**2/n
                            ricex.iloc[r,cm] = valm + (pix-valm)/n
                            ricex.iloc[r,cm+1] = ((n-2)/(n-1))*valv + (pix-valm)*(pix-valm)/n
                            ricex.iloc[r,cm+240] = n
                            times[7] += timeit.default_timer() - st
                            nums[7] += 1
                        #print ("No match for the district " + district + " for the year " + year)
elapsed = timeit.default_timer() - stx
print(elapsed)
print("Seconds")
Overflow was happening because the pixel value is returned as a narrow numpy integer type; it needs to be converted into a Python int before squaring (hence the int() cast above).
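A quick illustration of the failure mode, using uint8 as a stand-in for the band's pixel type (fixed-width numpy scalars wrap around instead of promoting to a wider type):
v = np.uint8(200)
print(v * v)            # 64: 200*200 = 40000 wraps around modulo 256
print(int(v) * int(v))  # 40000: Python ints do not overflow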
In [21]:
print(hits)
Calculating the time per iteration of each code block in the huge loop above helped in identifying the culprit:
In [22]:
print(times)
print(nums)
In [23]:
for i in range(8):
x = times[i]/nums[i]
print (str(i) + ": " + str(x))
Observations:
In [23]:
ricex.describe()
Out[23]:
In [22]:
ricex.to_csv("ricex_test1.csv")
In [23]:
fc
Out[23]:
In [6]:
fc.schema
Out[6]:
In [7]:
fc.crs
Out[7]:
In [8]:
len(fc)
Out[8]:
641 districts in India in total.
In [24]:
import timeit
In [27]:
a = timeit.default_timer()
for i in range(1000000):
j = 1
b = timeit.default_timer() - a
In [28]:
b
Out[28]:
DOUBT: Why is a LANDSAT 7 file smaller than a LANDSAT 8 file, in spite of the number of pixels being equal in the band files for both cases? Investigate this ... (likely because LANDSAT 7 stores 8-bit pixel values while LANDSAT 8 stores 16-bit values).
To calculate the relative dimensions of all 10 band files for a scene: whatever the dimensions, they are the same for all 10 files, except Band 8, which has 4 times the pixels of any other file.
In [10]:
for directory in directories:
    """ Identifying Month, Year, Spacecraft ID """
    date = directory.split('\\')[-1].split('_')[3] # Change for Win7
    satx = directory.split('\\')[-1][3]
    month = date[4:6]
    year = date[0:4]
    print("LANDSAT {}, MONTH: {}, YEAR: {}".format(satx, month, year))
    """ Visiting every GeoTIFF file """
    for _, _, files in os.walk(directory):
        for filename in files:
            if filename.endswith(".TIF"):
                print(filename.split("\\")[-1].split("_")[7:])
                ds = gdal.Open(os.path.join(directory, filename))
                if ds is None: continue
                col, row, _ = ds.RasterXSize, ds.RasterYSize, ds.RasterCount
                xoff, a, b, yoff, d, e = ds.GetGeoTransform()
                print("Col: {0:6}, Row: {1:6}".format(col, row))
So for LANDSAT 7: col,row ~ 8000,7000, with the exception of Band 8 at ~ 16000,14000.
Go to the base folder; extract every archive that is not yet extracted:
For each folder present there:
    For each TIFF file (for each band):
        Identify the following:
        - Month, Year
        - District Name
        - Cloud Cover Percentage
        - Sat 7 or 8 (maybe from the number of files in the folder!)
        Depending on the satellite, the meaning of the bands changes ... (put them in the corresponding features ...)
        Traverse every 100th pixel (for sat7 every Kth); a skeleton sketch follows below.
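A minimal skeleton of that plan, reusing the scene-name parsing from above. The *_MTL.txt filename and the CLOUD_COVER key follow the usual Landsat metadata convention; treat them as assumptions to verify against the actual files:
def cloud_cover(scene_dir):
    # Assumption: the scene folder contains a *_MTL.txt metadata file with a
    # line like "CLOUD_COVER = 22.00" (standard Landsat convention)
    for f in os.listdir(scene_dir):
        if f.endswith("_MTL.txt"):
            with open(os.path.join(scene_dir, f)) as mtl:
                for line in mtl:
                    if line.strip().startswith("CLOUD_COVER "):
                        return float(line.split("=")[1])
    return None

for directory in directories:
    name = directory.split('\\')[-1]  # Change for Win7
    satx = name[3]                    # spacecraft id: '7' or '8'
    date = name.split('_')[3]         # acquisition date, YYYYMMDD
    month, year = date[4:6], date[0:4]
    print(satx, month, year, cloud_cover(directory))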
In [ ]: